library(sas7bdat)
library(glmnet)
library(rpart)
library(party)
library(partykit)
library(gbm)
library(randomForest)
library(fastDummies)

s1 <- read.sas7bdat(file="/netapp2/home/rjd48/CHF/2018/final_analytic.sas7bdat")

#convert numeric to factor- for number of hf drug classes, number of total and HF hospitalizations- we need these as class variables

s1$n_hf_drug_classes= as.factor(s1$n_hf_drug_classes)
s1$prior_all_hosp= as.factor(s1$prior_all_hosp)
s1$prior_chf_hosp= as.factor(s1$prior_chf_hosp)

#rename some variables 

names(s1)[44] = "rx_loop_diuretic" #more informative name
names(s1)[39] = "rx_mra" #more informative name
names(s1)[46] = "rx_potassium_sparing" #more informative name

#glmnet for Lasso does not take categorical variables, so create dummies

s1 <- fastDummies::dummy_cols(s1, select_columns = "Hf_class", remove_first_dummy = TRUE)
s1 <- fastDummies::dummy_cols(s1, select_columns = "n_hf_drug_classes", remove_first_dummy = TRUE)
s1 <- fastDummies::dummy_cols(s1, select_columns = "prior_all_hosp", remove_first_dummy = TRUE)
s1 <- fastDummies::dummy_cols(s1, select_columns = "prior_chf_hosp", remove_first_dummy = TRUE)
s1 <- fastDummies::dummy_cols(s1, select_columns = "bnp", remove_first_dummy = TRUE)
s1 <- fastDummies::dummy_cols(s1, select_columns = "BMI_class", remove_first_dummy = TRUE)

train <- s1[s1$primary_hosp %in% c('mgh'),]
test <- s1[s1$primary_hosp %in% c('bwh'),]

##############################################################################################################

## METHOD 1- LASSO http://web.stanford.edu/~hastie/glmnet/glmnet_beta.html#log

# we perform variable selection with lasso using 10 fold cross validation on the dataset, the regularization parameters
# are selected in a way in which deviance is minimized

#############################################################################################################

predictors= as.matrix(train[c(2:53, 79:91)])

#plot for the full model

fit = glmnet(predictors, train$chf_hosp_365 ,family = "binomial")
plot(fit, xvar = "dev", label = TRUE)

#cross validation to select lambda. default is 10 fold
#alpha=0 is for ridge, 1 for lasso. nlambda  the number of values of λ is recommended to be 100 (default) or more
set.seed(8888)
cvfit = cv.glmnet(predictors, train$chf_hosp_365 , alpha=1, family = "binomial", type.measure = "deviance", nlambda=100, nfolds = 10) 

plot(cvfit)

#Make predictions using the testing data set

test_input <- as.data.frame(test[c(2:53, 79:91)])

lasso.pred1 <- predict(cvfit, type="response", newx = data.matrix(test_input), s= "lambda.min")

##attach predicted probabilities to the dataset

test1 <- cbind (test, lasso.pred1)

colnames(test1)[colnames(test1)=="1"] <- "pred_prob_lasso"


##############################################################################################################

#METHOD 2- CART 

# Conditional framework- https://cran.r-project.org/web/packages/party/vignettes/party.pdf

##############################################################################################################

predictor_vars= train[c(2:53, 79:91, 66)]

ct <- ctree(as.factor(chf_hosp_365 ) ~ .,data = predictor_vars,
            control= ctree_control(minsplit = 10)) # the err= value shows misclassification rate when chf_hosp_365  variable is a factor and sum squared error when modeled as numeric

#Check the rules

print(ct)

# summarize the probabilities at each terminal node 

pred <- aggregate(predict(ct, type = "prob"),
                  list(predict(ct, type = "node")), FUN = mean)

#create a plot ie the CART based on this information

ct_node <- as.list(ct$node)
for(i in 1:nrow(pred)) {
  ct_node[[pred[i,1]]]$info$prediction <- paste(
    format(names(pred)[-1]),
    format(round(pred[i, -1], digits = 3), nsmall = 3)
  )
}
ct$node <- as.partynode(ct_node)

cart <- plot(ct, terminal_panel = node_terminal, tp_args = list(
  FUN = function(node) c("Probability", node$prediction)))

#Make predictions using the test data set

ctree.pred <- predict(ct, newdata= test, type="prob")

##attach predicted probabilities to the test dataset

test2 <- cbind (test1, ctree.pred)

colnames(test2)[colnames(test2)=="1"] <- "pred_prob_cart"

##############################################################################################################

#METHOD 3- REGULAR LOGISTIC REGRESSION WITHOUT ANY VARIABLE SELECTION

#############################################################################################################

predictor_vars= train[c(2:57, 66)]

logreg <- glm(chf_hosp_365  ~ ., data=predictor_vars, family=binomial)
summary(logreg)

#Make predictions using the test data set

test2$pred_prob_logreg	<- predict(logreg,	newdata=test2, type	= "response")

##############################################################################################################

#METHOD 4- Gradient boosted model

#############################################################################################################

predictor_vars= train[c(2:57, 66)]
set.seed(555)
boosted <- gbm(chf_hosp_365  ~ ., data=predictor_vars, distribution = "bernoulli",n.trees = 10000,
               shrinkage = 0.01, interaction.depth = 4, cv.folds = 10)

optimal_trees_gbm <- gbm.perf(boosted, plot.it = TRUE, overlay = TRUE, method="cv")

# relative influence plots to see which variables are most predictive
par(mar = c(5,10,4,2))
summary(boosted, cBars = 20, method = relative.influence, las = 1, n.trees=optimal_trees_gbm, xlim= c(0, 20), 
        main="HFH-claims only model relative influence plot") #las option labels horizontally on y axis

## in the testing data

pred_prob_gbm <- predict(boosted, newdata = test, n.trees = optimal_trees_gbm, type = "response")

test3 <- cbind (test2, pred_prob_gbm)


##############################################################################################################

#METHOD 5- Random Forest

#############################################################################################################

predictor_vars= train[c(2:57, 66)]
indep_vars = train[c(2:57)]
set.seed(555)
RF <- randomForest(as.factor(chf_hosp_365) ~ ., data=predictor_vars, ntree=500)

plot(RF) #out of bag error rate based on out of bag portion of the data (ie from observations not selected randomly)
varImpPlot(RF) #variable influence plot

# predict in testing data

pred_prob_rf1 <- (predict(RF, newdata = test, type = "prob")) #gives out a non-numeric matrix
pred_prob_random <- unname(pred_prob_rf1[,2]) # only keep probability of outcome=1

test4 <- cbind(test3, pred_prob_random)

##############################################################################################################

## Assess performance

##############################################################################################################

outcome=test4$chf_hosp_365

#validate predictions from the rms package, will be used for Brier score and calibration plots
library (rms)

logistic_val <- val.prob(p= test4$pred_prob_logreg, y= outcome, g=10, smooth=T,  logistic.cal = F, legendloc= F, statloc = F, pl=F)
lasso_val <- val.prob(p= test4$pred_prob_lasso, y= outcome, g=10, smooth=T,  logistic.cal = F, legendloc= F, statloc = F, pl=F)
cart_val <- val.prob(p= test4$pred_prob_cart, y= outcome, g=10, smooth=T,  logistic.cal = F, legendloc= F, statloc = F, pl=F)
RF_val <- val.prob(p= test4$pred_prob_random, y= outcome, g=10, smooth=T,  logistic.cal = F, legendloc= F, statloc = F, pl=F)
gbm_val <- val.prob(p= test4$pred_prob_gbm, y= outcome, g=10, smooth=T,  logistic.cal = F, legendloc= F, statloc = F, pl=F)

#Metric 1- Brier scores for reporting 

logistic_val["Brier"]
lasso_val["Brier"]
cart_val["Brier"]
RF_val["Brier"]
gbm_val["Brier"]

# Metric 2- AUC with 95% CI, DeLong tests

library(pROC)

rocobj1 <- roc(outcome, test4$pred_prob_logreg, ci=TRUE, of="auc")
rocobj2 <- roc(outcome, test4$pred_prob_lasso, ci=TRUE, of="auc")
rocobj3 <- roc(outcome, test4$pred_prob_cart, ci=TRUE, of="auc")
rocobj4 <- roc(outcome, test4$pred_prob_random, ci=TRUE, of="auc")
rocobj5 <- roc(outcome, test4$pred_prob_gbm, ci=TRUE, of="auc")

roc.test(rocobj1, rocobj2, method="d")
roc.test(rocobj1, rocobj3, method="d")
roc.test(rocobj1, rocobj4, method="d")
roc.test(rocobj1, rocobj5, method="d")

#Metric 3- calibration plots (slope and intercept reported on the plots)

logistic_slope <-logistic_val["Slope"]
lasso_slope <-lasso_val["Slope"]
cart_slope <-cart_val["Slope"]
RF_slope <-RF_val["Slope"]
gbm_slope <-gbm_val["Slope"]

logistic_int <-logistic_val["Intercept"]
lasso_int <-lasso_val["Intercept"]
cart_int <-cart_val["Intercept"]
RF_int <-RF_val["Intercept"]
gbm_int <-gbm_val["Intercept"]

#plot them

par(mfrow=c(2,3))
par(cex=1.0)
logistic_val <- val.prob(p= test4$pred_prob_logreg, y= outcome, g=10, smooth=T,  logistic.cal = F, legendloc= F, statloc = F)+title("Logistic regression")+
  text(0.1, 0.77, "Slope ") +text (0.23,0.77, round(logistic_slope, 3)) +  text(0.1, 0.69, "Intercept ") +text (0.23,0.69, round(logistic_int, 3)) 

lasso_val <- val.prob(p= test4$pred_prob_lasso, y= outcome, g=10, smooth=T,  logistic.cal = F, legendloc= F, statloc = F)+title("LASSO")+
  text(0.1, 0.77, "Slope ") +text (0.23,0.77, round(lasso_slope, 3)) +  text(0.1, 0.69, "Intercept ") +text (0.23,0.69, round(lasso_int, 3)) 

cart_val <- val.prob(p= test4$pred_prob_cart, y= outcome, g=10, smooth=T,  logistic.cal = F, legendloc= F, statloc = F)+title("CART")+
  text(0.1, 0.77, "Slope ") +text (0.23,0.77, round(cart_slope, 3)) +  text(0.1, 0.69, "Intercept ") +text (0.23,0.69, round(cart_int, 3)) 

RF_val <- val.prob(p= test4$pred_prob_random, y= outcome, g=10, smooth=T,  logistic.cal = F, legendloc= F, statloc = F)+title("Random Forest")+
  text(0.1, 0.77, "Slope ") +text (0.23,0.77, round(RF_slope, 3)) +  text(0.1, 0.69, "Intercept ") +text (0.23,0.69, round(RF_int, 3)) 

gbm_val <- val.prob(p= test4$pred_prob_gbm, y= outcome, g=10, smooth=T,  logistic.cal = F, legendloc= F, statloc = F)+title("GBM")+
  text(0.1, 0.77, "Slope ") +text (0.23,0.77, round(gbm_slope, 3)) +  text(0.1, 0.69, "Intercept ") +text (0.23,0.69, round(gbm_int, 3)) 

par(cex=1.2)
title("
      Heart failure hospitalization- Claims only", outer=T)

dev.off()
